Laptop Specifications Data Cleaning
Transformed raw laptop specs into analysis-ready data using SQL
Project Overview
Cleaned 1,200+ laptop specifications from various manufacturers. Performed complex data transformations including feature extraction, data type standardization, and quality assurance checks.
Data Transformation Process
1. Initial Cleaning
- Duplicate removal
- Backup creation
2. Feature Engineering
- GPU/CPU brand extraction
- Screen resolution parsing
- Memory type classification
3. Normalization
- Data type conversion
- Column splitting
- Unit standardization
Sample Results
Web to Table: Data Before and After
Data Cleaning Process Steps
Database Selection
USE db;
Data Backup (DDL + DML)
# Take Data backup
CREATE TABLE laptop_backup LIKE laptopdata;
INSERT INTO laptop_backup
SELECT * FROM laptopdata;
Storage Size Check (DQL)
# byte/1024 = KB
SELECT Data_length/1024 FROM information_schema.Tables
WHERE Table_schema = 'db' AND Table_name = 'laptopdata';
Column Renaming (DDL)
# Change column name Unnamed: 0 to index
ALTER TABLE laptopdata RENAME COLUMN `Unnamed: 0` TO `index`;
Null Values Check (DQL)
SELECT *
FROM laptopdata l
WHERE Company IS NULL OR TypeName IS NULL OR Inches IS NULL
OR ScreenResolution IS NULL OR Cpu IS NULL OR Ram IS NULL
OR Memory IS NULL OR Gpu IS NULL OR OpSys IS NULL
OR Weight IS NULL OR Price IS NULL OR l.index IS NULL;
Duplicate Management (DML + DQL)
# Distinct index count
SELECT COUNT(DISTINCT(l.index)) FROM laptopdata l;
# Delete duplicates
DELETE FROM laptopdata l
WHERE l.index IN (
SELECT t.index FROM (
SELECT l.index,
ROW_NUMBER() OVER(PARTITION BY Company, TypeName, Inches,
ScreenResolution,Cpu, Ram, Memory,
Gpu, OpSys, Weight, Price) r
FROM laptopdata l) t
WHERE r > 1);
# Verify duplicates
SELECT t.index FROM (
SELECT l.index,
ROW_NUMBER() OVER(PARTITION BY Company, TypeName, Inches,
ScreenResolution,Cpu, Ram, Memory,
Gpu, OpSys, Weight, Price) r
FROM laptop l) t
WHERE r > 1;
RAM Column Cleaning (DML + DDL)
UPDATE laptopdata
SET Ram = REPLACE(Ram,'GB','');
ALTER TABLE laptopdata MODIFY COLUMN ram INTEGER;
Weight Column Cleaning (DML + DDL)
UPDATE laptopdata
SET weight = REPLACE(weight,'kg','');
# Handle non-numeric values
UPDATE laptopdata
SET weight = REPLACE(weight,'?','0');
ALTER TABLE laptopdata MODIFY COLUMN weight DECIMAL(10,3);
Price Standardization (DML + DDL)
UPDATE laptopdata
SET Price = ROUND(price);
ALTER TABLE laptopdata MODIFY COLUMN price INTEGER;
OS Standardization (DML)
UPDATE laptopdata
SET OpSys = CASE
WHEN OpSys LIKE '%mac%' THEN 'macos'
WHEN OpSys LIKE '%Windows%' THEN 'windows'
WHEN OpSys LIKE '%Linux%' THEN 'linux'
WHEN OpSys LIKE '%Chrome%' THEN 'chromeOS'
WHEN OpSys LIKE '%Android%' THEN 'androidOS'
WHEN OpSys LIKE '$No OS%' THEN 'n/a'
END;
GPU Feature Extraction (DDL + DML)
ALTER TABLE laptopdata
ADD COLUMN Gpu_brand VARCHAR(255) AFTER Gpu,
ADD COLUMN Gpu_name VARCHAR(255) AFTER Gpu_brand;
UPDATE laptopdata
SET Gpu_brand = SUBSTRING_INDEX(Gpu,' ',1),
Gpu_name = REPLACE(Gpu,Gpu_brand,'');
ALTER TABLE laptopdata DROP COLUMN Gpu;
CPU Feature Engineering (DDL + DML)
ALTER TABLE laptopdata
ADD COLUMN Cpu_brand VARCHAR(255) AFTER Cpu,
ADD COLUMN Cpu_name VARCHAR(255) AFTER Cpu_brand,
ADD COLUMN Cpu_speed VARCHAR(255) AFTER Cpu_model;
UPDATE laptopdata
SET Cpu_brand = SUBSTRING_INDEX(Cpu,' ',1),
Cpu_name = SUBSTRING_INDEX(REPLACE(Cpu,Cpu_brand, ''),Cpu_speed,1),
Cpu_speed = REPLACE(SUBSTRING_INDEX(Cpu,' ',-1), 'GHz', '');
ALTER TABLE laptopdata DROP COLUMN cpu;
Screen Resolution Parsing (DDL + DML)
ALTER TABLE laptopdata
ADD COLUMN Resolution_width INT AFTER ScreenResolution,
ADD COLUMN Resolution_height INT AFTER Resolution_width,
ADD COLUMN Touch_screen INT AFTER Resolution_height;
UPDATE laptopdata
SET Resolution_width = SUBSTRING_INDEX(SUBSTRING_INDEX(ScreenResolution, ' ',-1),'x',1),
Resolution_height = SUBSTRING_INDEX(SUBSTRING_INDEX(ScreenResolution, ' ',-1),'x',-1),
Touch_screen = CASE WHEN ScreenResolution LIKE '%Touchscreen%' THEN 1 ELSE 0 END;
ALTER TABLE laptopdata DROP COLUMN ScreenResolution;
Memory Column Normalization (DDL + DML)
ALTER TABLE laptopdata
ADD COLUMN memory_type VARCHAR(255) AFTER memory,
ADD COLUMN primary_storage INT AFTER memory_type,
ADD COLUMN secondary_storage INT AFTER primary_storage;
UPDATE laptopdata
SET memory_type = CASE
WHEN Memory LIKE '%SSD%' AND Memory LIKE '%HDD%' THEN 'Hybrid'
WHEN Memory LIKE '%SSD%' THEN 'SSD'
WHEN Memory LIKE '%HDD%' THEN 'HDD'
WHEN Memory LIKE '%Flash%' THEN 'Flash'
ELSE NULL
END;
UPDATE laptopdata
SET primary_storage = REGEXP_SUBSTR(SUBSTRING_INDEX(memory,'+',1), '[0-9]+'),
secondary_storage = CASE
WHEN memory LIKE '%+%' THEN REGEXP_SUBSTR(SUBSTRING_INDEX(memory,'+',-1), '[0-9]+')
ELSE 0
END;
UPDATE laptopdata
SET secondary_storage = CASE
WHEN secondary_storage <= 2 THEN secondary_storage * 1024
ELSE secondary_storage
END;
ALTER TABLE laptopdata DROP COLUMN memory;
CPU Name Simplification (DML)
UPDATE laptopdata
SET Cpu_name = SUBSTRING_INDEX(cpu_name, ' ',3);
Quality Metrics
| Metric | Before | After |
|---|---|---|
| Duplicate Records | 29 | 0 |
| Null Values | 0 | 0 |
| Column Consistency | 40% | 100% |
Final Schema Design
DESCRIBE laptopdata;
+---------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+---------------------+--------------+------+-----+---------+-------+
| index | INT | YES | | NULL | |
| Company | VARCHAR(255) | YES | | NULL | |
| TypeName | VARCHAR(255) | YES | | NULL | |
| Inches | DECIMAL(5,2) | YES | | NULL | |
| Resolution_width | INT | YES | | NULL | |
| Resolution_height | INT | YES | | NULL | |
| Touch_screen | INT | YES | | NULL | |
| Cpu_brand | VARCHAR(50) | YES | | NULL | |
| Cpu_name | VARCHAR(255) | YES | | NULL | |
| Ram | INT | YES | | NULL | |
| ... | ... | ... | ... | ... | ... |
+---------------------+--------------+------+-----+---------+-------+
Applications Enabled
- Price-performance analysis
- Manufacturer comparison
- Laptop recommendation systems
- Market trend analysis